Checkpoint简介

Flink 通过 Checkpoint 机制定期保存作业状态的快照；发生 failover 后，作业从最近一次成功的 Checkpoint 处恢复，并以此提供 Exactly-Once 的语义保障

CheckpointCoordinator

/**
 * Restores the job from the savepoint identified by {@code savepointPointer}.
 *
 * <p>The method runs these steps in order: resolve the pointer to a storage location, load and
 * validate the savepoint as a {@code CompletedCheckpoint}, register it with the completed
 * checkpoint store, move the checkpoint ID counter to one past the savepoint's ID, and finally
 * delegate to {@code restoreLatestCheckpointedStateInternal}.
 *
 * @param savepointPointer pointer to the savepoint; checked with {@code Preconditions.checkNotNull}
 * @param allowNonRestored forwarded to the load/validate step and to the final restore call; it
 *     also selects the trailing note in the start-up log line
 * @param tasks job vertices keyed by vertex ID; the map is passed to the load/validate step and
 *     its values are handed to the final restore call
 * @param userClassLoader class loader passed to the load/validate step
 * @return the result of {@code restoreLatestCheckpointedStateInternal}
 * @throws Exception propagated from any of the steps above
 */
public boolean restoreSavepoint(
        String savepointPointer,
        boolean allowNonRestored,
        Map<JobVertexID, ExecutionJobVertex> tasks,
        ClassLoader userClassLoader) throws Exception {

    Preconditions.checkNotNull(savepointPointer, "The savepoint path cannot be null.");

    // Trailing note for the log line; empty when non-restored state is not allowed.
    final String nonRestoredNote;
    if (allowNonRestored) {
        nonRestoredNote = "allowing non restored state";
    } else {
        nonRestoredNote = "";
    }
    LOG.info("Starting job {} from savepoint {} ({})", job, savepointPointer, nonRestoredNote);

    // Resolve the savepoint pointer into a storage location.
    // NOTE(review): the original notes describe this as obtaining an HDFS address; the actual
    // backing store depends on how checkpointStorage is configured -- confirm.
    final CompletedCheckpointStorageLocation resolvedLocation =
            checkpointStorage.resolveCheckpoint(savepointPointer);

    // Load the savepoint as a CompletedCheckpoint.
    // NOTE(review): per the original notes (helper internals are not visible here), this call
    //   1. loads the metadata,
    //   2. builds the operator-to-task mapping,
    //   3. checks the parallelism,
    //   4. converts the savepoint into a checkpoint so it can be used for recovery after failure.
    final CompletedCheckpoint restoredSavepoint = Checkpoints.loadAndValidateCheckpoint(
            job,
            tasks,
            resolvedLocation,
            userClassLoader,
            allowNonRestored);

    // Register the loaded savepoint with the completed checkpoint store.
    // NOTE(review): the original notes say this writes to ZooKeeper and asynchronously removes
    // older checkpoints; that depends on the store implementation -- confirm.
    completedCheckpointStore.addCheckpoint(restoredSavepoint);

    // Reset the checkpoint ID counter to one past the savepoint's ID.
    final long followingCheckpointId = restoredSavepoint.getCheckpointID() + 1;
    checkpointIdCounter.setCount(followingCheckpointId);

    LOG.info("Reset the checkpoint ID of job {} to {}.", job, followingCheckpointId);

    // Restore state from the latest checkpoint, i.e. the savepoint registered above.
    // NOTE(review): the original notes say the callee fetches the operator states and assigns
    // them to the tasks; the meaning of the two literal `true` flags is not visible here.
    return restoreLatestCheckpointedStateInternal(
            new HashSet<>(tasks.values()),
            true,
            true,
            allowNonRestored);
}